This tutorial uses fake data developed through the synthpop package in R.

Setting up R environment

library(tidyverse) # working with datasets
library(specr) # specification curves
library(mice) # multiple imputation
library(semTools) # missing data information

Loading and prepping data

df = read.csv("crea_data.csv") # loading dataset
head(df) # checking data
##      SubID Group       Age Schoolgrade Gender Race Ethnicity     Income
## 1 PA001_V1    DA  7.381246           1      0    5         1  1.8307418
## 2 PA002_V1     C  8.427105           3      1    4         0  7.8539171
## 3 PA003_V1    PI  8.588638           4      0    2         0 12.3572735
## 4 PA007_V1    DC  7.411362           2      0    5         0  0.3388797
## 5 PA008_V1    DA 10.105407           5      1    6         0  1.8353522
## 6 PA009_V1     C 11.904175           6      0    6         1  2.8364534
##   Care_Edu Reporter_relation Attachment_security_8 Attachment_security_4
## 1       19                 1            0.67261422             1.0591693
## 2       16                 1            1.04733955             0.9418006
## 3       18                 1                    NA             0.1202198
## 4       14                 0            1.15440393             1.0591693
## 5       19                 1            1.31500049             1.1765380
## 6       20                 1           -0.02330423            -0.2318863
##          PBS       CBCL     WIAT_W     WIAT_P     WIAT_N  WASI_VOCAB
## 1  0.6079087  2.8662440         NA         NA         NA -2.16061326
## 2 -0.4036524 -0.5215826         NA         NA         NA -0.09780554
## 3  0.3345138 -0.5215826         NA         NA         NA -0.44160683
## 4         NA -0.8980078 -0.2570949 -0.8328702 -0.9655284 -0.52755715
## 5 -1.3878740 -0.8980078  0.6773165  0.8972241  1.2011991  0.24599575
## 6  1.8381858 -0.5215826  0.1517101  0.3411224 -0.3282556 -0.18375586
##   WASI_MATRIX crEA
## 1   1.5240056    1
## 2   0.1809921    0
## 3  -0.1547613    1
## 4   1.5240056    1
## 5  -0.7423298    1
## 6  -0.1547613    0
# correcting variable types (making sure categorical variables are coded as factors)
df = df %>% mutate(
  Gender = as.factor(Gender),
  Race = as.factor(Race),
  Ethnicity =  as.factor(Ethnicity),
)

Checking pattern of missing data

md.pattern(df, rotate.names = TRUE) # pattern of missing data

##     SubID Group Age Schoolgrade Gender Race Ethnicity crEA
## 157     1     1   1           1      1    1         1    1
## 55      1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 2       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 2       1     1   1           1      1    1         1    1
## 40      1     1   1           1      1    1         1    1
## 12      1     1   1           1      1    1         1    1
## 9       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 2       1     1   1           1      1    1         1    1
## 4       1     1   1           1      1    1         1    1
## 2       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 2       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 5       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 2       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 3       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 3       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
## 2       1     1   1           1      1    1         1    1
## 1       1     1   1           1      1    1         1    1
##         0     0   0           0      0    0         0    0
##     Attachment_security_4 WASI_MATRIX WASI_VOCAB Attachment_security_8 Income
## 157                     1           1          1                     1      1
## 55                      1           1          1                     1      1
## 1                       1           1          1                     1      1
## 2                       1           1          1                     1      1
## 1                       1           1          1                     1      1
## 2                       1           1          1                     1      1
## 40                      1           1          1                     1      1
## 12                      1           1          1                     1      1
## 9                       1           1          1                     1      1
## 1                       1           1          1                     1      1
## 2                       1           1          1                     1      1
## 4                       1           1          1                     1      1
## 2                       1           1          1                     1      1
## 1                       1           1          1                     1      1
## 1                       1           1          1                     1      1
## 1                       1           1          1                     1      1
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 2                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 1                       1           1          1                     1      0
## 5                       1           1          1                     0      1
## 1                       1           1          1                     0      1
## 2                       1           1          1                     0      1
## 1                       1           1          1                     0      1
## 3                       1           1          0                     1      1
## 1                       1           1          0                     1      1
## 1                       1           1          0                     1      1
## 3                       1           0          1                     1      1
## 1                       1           0          1                     1      1
## 2                       0           1          1                     1      1
## 1                       0           1          0                     1      1
##                         3           4          6                     9     12
##     Care_Edu Reporter_relation CBCL WIAT_N WIAT_W WIAT_P PBS    
## 157        1                 1    1      1      1      1   1   0
## 55         1                 1    1      1      1      1   0   1
## 1          1                 1    1      1      0      0   1   2
## 2          1                 1    1      1      0      0   0   3
## 1          1                 1    1      0      1      1   1   1
## 2          1                 1    1      0      1      0   1   2
## 40         1                 1    1      0      0      0   1   3
## 12         1                 1    1      0      0      0   0   4
## 9          1                 1    0      1      1      1   1   1
## 1          1                 1    0      1      1      1   0   2
## 2          1                 1    0      0      0      0   1   4
## 4          1                 0    1      1      1      1   1   1
## 2          1                 0    1      1      1      1   0   2
## 1          1                 0    1      0      0      0   1   4
## 1          0                 1    1      1      1      1   1   1
## 1          0                 1    1      1      1      1   0   2
## 1          1                 1    1      1      1      1   1   1
## 1          1                 1    1      1      1      1   0   2
## 1          0                 1    1      1      1      1   1   2
## 1          0                 1    1      1      1      1   0   3
## 1          0                 1    1      0      0      0   1   5
## 1          0                 0    1      1      1      1   1   3
## 1          0                 0    1      1      1      1   0   4
## 1          0                 0    1      0      0      0   0   7
## 2          0                 0    0      1      1      1   1   4
## 1          0                 0    0      1      1      1   0   5
## 1          0                 0    0      1      0      0   0   7
## 5          1                 1    1      1      1      1   1   1
## 1          1                 1    1      1      1      1   0   2
## 2          1                 1    1      0      0      0   1   4
## 1          1                 1    0      1      1      1   0   3
## 3          1                 1    1      1      1      1   1   1
## 1          1                 1    1      1      0      0   0   4
## 1          1                 1    1      0      0      0   1   4
## 3          1                 1    1      1      1      1   1   1
## 1          1                 1    1      0      0      0   0   5
## 2          1                 1    1      1      1      1   0   2
## 1          1                 1    1      1      1      1   1   2
##           12                14   17     64     66     68  85 360
mean(is.na(df$PBS)) # checking percentage of missing data
## [1] 0.2623457
mean(is.na(df$WIAT_N))
## [1] 0.1975309

First initial imputations

ini = mice(df, seed = 329, print = F, m = 5) # dry run of imputations to see how it works

summary(ini) # summary of imputed object
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##                 SubID                 Group                   Age 
##                    ""                    ""                    "" 
##           Schoolgrade                Gender                  Race 
##                    ""                    ""                    "" 
##             Ethnicity                Income              Care_Edu 
##                    ""                 "pmm"                 "pmm" 
##     Reporter_relation Attachment_security_8 Attachment_security_4 
##                 "pmm"                 "pmm"                 "pmm" 
##                   PBS                  CBCL                WIAT_W 
##                 "pmm"                 "pmm"                 "pmm" 
##                WIAT_P                WIAT_N            WASI_VOCAB 
##                 "pmm"                 "pmm"                 "pmm" 
##           WASI_MATRIX                  crEA 
##                 "pmm"                    "" 
## PredictorMatrix:
##             SubID Group Age Schoolgrade Gender Race Ethnicity Income Care_Edu
## SubID           0     0   1           1      1    1         1      1        1
## Group           0     0   1           1      1    1         1      1        1
## Age             0     0   0           1      1    1         1      1        1
## Schoolgrade     0     0   1           0      1    1         1      1        1
## Gender          0     0   1           1      0    1         1      1        1
## Race            0     0   1           1      1    0         1      1        1
##             Reporter_relation Attachment_security_8 Attachment_security_4 PBS
## SubID                       1                     1                     1   1
## Group                       1                     1                     1   1
## Age                         1                     1                     1   1
## Schoolgrade                 1                     1                     1   1
## Gender                      1                     1                     1   1
## Race                        1                     1                     1   1
##             CBCL WIAT_W WIAT_P WIAT_N WASI_VOCAB WASI_MATRIX crEA
## SubID          1      1      1      1          1           1    1
## Group          1      1      1      1          1           1    1
## Age            1      1      1      1          1           1    1
## Schoolgrade    1      1      1      1          1           1    1
## Gender         1      1      1      1          1           1    1
## Race           1      1      1      1          1           1    1
## Number of logged events:  27 
##   it im dep     meth   out
## 1  0  0     constant SubID
## 2  0  0     constant Group
## 3  1  1 PBS      pmm Race5
## 4  1  2 PBS      pmm Race5
## 5  1  3 PBS      pmm Race5
## 6  1  4 PBS      pmm Race5

Conducting linear regression on imputed models using MICE

fit <- with(ini, lm(CBCL ~ crEA)) # linear regression on each imputed dataset separately
summary(pool(fit)) # combining information from multiple datasets
##          term   estimate std.error statistic       df      p.value
## 1 (Intercept) -0.2767081 0.1034556 -2.674654 301.0944 7.889412e-03
## 2        crEA  0.4882505 0.1235432  3.952062 290.1153 9.738233e-05

Identifying auxiliary variables

## Example for how to check for variables related to missigness. For the sake of today's workshop, we will include all variables in the imputation model (better to err on the side of more!). 

df = df %>% mutate (pbs_missing = is.na(df$PBS)) # create variable which specifies missingness

t.test(df$Age ~ df$pbs_missing) # identifying continuous variables related to missingness
## 
##  Welch Two Sample t-test
## 
## data:  df$Age by df$pbs_missing
## t = -0.14344, df = 133.1, p-value = 0.8862
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -0.6098686  0.5273964
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            9.297283            9.338519
summary(glm(data = df, formula = pbs_missing ~ Gender, family = binomial(link='logit')))  # identifying categorical variables related to missingness
## 
## Call:
## glm(formula = pbs_missing ~ Gender, family = binomial(link = "logit"), 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9110  -0.9110  -0.6451   1.4696   1.8286  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4639     0.1993  -7.345 2.06e-13 ***
## Gender1       0.7989     0.2603   3.069  0.00215 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 372.92  on 323  degrees of freedom
## Residual deviance: 363.20  on 322  degrees of freedom
## AIC: 367.2
## 
## Number of Fisher Scoring iterations: 4

Changing the prediction matrix

pred = ini$predictorMatrix # which variables predicted what - can be useful when you don't want certain variables to be predictors. MICE doesn't allow you to remove a variable from being predicted unless you also want to remove it from being a predictor (you will have to create your own prediction matrix for that)
## 1 represents that column variable was used to predict the row variable

pred[ , c('Attachment_security_4')] <- 0 # remove from predicting all variables
pred['PBS', 'Age'] <- 0  # remove specific predictors for specific outcomes

Calculating number of imputed datasets required

# percentage of missing data: minimum number of datasets to impute
mean(is.na(df$PBS))
## [1] 0.2623457
# fraction of missing information (FMI/m should  be less than 0.01)
fmi(df, method = "saturated", group = NULL, ords = NULL,
  varnames = NULL, exclude = NULL, fewImps = FALSE)
## $Covariances
## $Covariances$coef
##                       Age    Schlgr Income Car_Ed Rprtr_ Att__8 Att__4 PBS   
## Age                    4.563                                                 
## Schoolgrade            4.375  4.440                                          
## Income                -0.276 -0.336 19.778                                   
## Care_Edu              -0.185 -0.260  3.675  9.176                            
## Reporter_relation      0.066  0.071 -0.106 -0.257  0.169                     
## Attachment_security_8  0.049  0.065 -0.214  0.094 -0.025  0.914              
## Attachment_security_4  0.068  0.073 -0.193  0.069 -0.018  0.905  0.929       
## PBS                    0.140  0.081  0.425  0.477 -0.092  0.009  0.014  1.016
## CBCL                  -0.247 -0.285  0.003  0.093 -0.015 -0.084 -0.082 -0.212
## WIAT_W                 0.054  0.081  0.510  0.167 -0.034  0.058  0.029  0.071
## WIAT_P                 0.094  0.159  0.394  0.151  0.022  0.120  0.099  0.019
## WIAT_N                -0.139 -0.098  0.430  0.679 -0.025 -0.029 -0.061  0.005
## WASI_VOCAB            -0.101 -0.101  1.387  0.169 -0.052 -0.008 -0.020  0.158
## WASI_MATRIX            0.016  0.021  0.366  0.330 -0.009 -0.111 -0.114  0.092
## crEA                   0.079  0.069 -0.042  0.127  0.019 -0.006 -0.003 -0.072
##                       CBCL   WIAT_W WIAT_P WIAT_N WASI_V WASI_M crEA  
## Age                                                                   
## Schoolgrade                                                           
## Income                                                                
## Care_Edu                                                              
## Reporter_relation                                                     
## Attachment_security_8                                                 
## Attachment_security_4                                                 
## PBS                                                                   
## CBCL                   1.051                                          
## WIAT_W                -0.189  1.022                                   
## WIAT_P                -0.142  0.750  0.927                            
## WIAT_N                -0.161  0.624  0.482  1.056                     
## WASI_VOCAB            -0.020  0.465  0.395  0.411  0.945              
## WASI_MATRIX           -0.113  0.507  0.433  0.462  0.271  0.989       
## crEA                   0.105 -0.133 -0.077 -0.064 -0.056 -0.077  0.207
## 
## $Covariances$fmi
##                       Age   Schlgr Income Car_Ed Rprtr_ Att__8 Att__4 PBS  
## Age                   0.000                                                
## Schoolgrade           0.000 0.000                                          
## Income                0.054 0.062  0.038                                   
## Care_Edu              0.062 0.067  0.036  0.032                            
## Reporter_relation     0.053 0.059  0.031  0.047  0.046                     
## Attachment_security_8 0.001 0.001  0.018  0.018  0.046  0.000              
## Attachment_security_4 0.000 0.000  0.016  0.019  0.058  0.000  0.001       
## PBS                   0.280 0.292  0.143  0.299  0.379  0.253  0.264  0.255
## CBCL                  0.052 0.059  0.051  0.088  0.100  0.036  0.036  0.221
## WIAT_W                0.174 0.163  0.253  0.145  0.194  0.133  0.154  0.355
## WIAT_P                0.195 0.180  0.269  0.154  0.194  0.149  0.172  0.340
## WIAT_N                0.168 0.158  0.265  0.150  0.181  0.140  0.162  0.295
## WASI_VOCAB            0.011 0.009  0.029  0.036  0.045  0.014  0.012  0.250
## WASI_MATRIX           0.019 0.016  0.109  0.036  0.056  0.006  0.005  0.182
## crEA                  0.000 0.000  0.020  0.015  0.022  0.001  0.000  0.211
##                       CBCL  WIAT_W WIAT_P WIAT_N WASI_V WASI_M crEA 
## Age                                                                 
## Schoolgrade                                                         
## Income                                                              
## Care_Edu                                                            
## Reporter_relation                                                   
## Attachment_security_8                                               
## Attachment_security_4                                               
## PBS                                                                 
## CBCL                  0.054                                         
## WIAT_W                0.155 0.220                                   
## WIAT_P                0.166 0.215  0.220                            
## WIAT_N                0.191 0.214  0.212  0.230                     
## WASI_VOCAB            0.058 0.160  0.177  0.166  0.019              
## WASI_MATRIX           0.052 0.143  0.161  0.151  0.028  0.016       
## crEA                  0.045 0.219  0.253  0.242  0.010  0.013  0.000
## 
## 
## $Means
##                  variable group   coef   fmi
## 121                   Age     1  9.308 0.000
## 122           Schoolgrade     1  3.563 0.000
## 123                Income     1  4.434 0.034
## 124              Care_Edu     1 16.113 0.035
## 125     Reporter_relation     1  0.809 0.042
## 126 Attachment_security_8     1  0.027 0.001
## 127 Attachment_security_4     1  0.020 0.000
## 128                   PBS     1  0.049 0.259
## 129                  CBCL     1  0.077 0.048
## 130                WIAT_W     1  0.187 0.181
## 131                WIAT_P     1  0.139 0.202
## 132                WIAT_N     1  0.105 0.184
## 133            WASI_VOCAB     1  0.036 0.013
## 134           WASI_MATRIX     1 -0.051 0.009
## 135                  crEA     1  0.707 0.000

Conducting new imputations

imp_mypred <- mice(df, predictorMatrix = pred, seed = 329, print=F, maxit = 10, m = 10) # using our own prediction matrix -- change maxit and mice to increase number of iterations and number of imputed data sets respectively

imp_quickpred = mice(df, pred=quickpred(df, mincor=.3), seed = 329, print=F) # the second option is to use pre-set criteria for which variables shouldbe used to predict

plot(imp_mypred) # check convergence of data sets - want them to be free of any trends in later iterations

stripplot(imp_mypred, CBCL~.imp, pch = 20, cex = 2) # check reasonability of imputed values

summary(with(imp_mypred, mean(PBS)))  ## mean from  imputed data set
## # A tibble: 10 × 1
##            x
##        <dbl>
##  1 -0.000395
##  2  0.0575  
##  3  0.0610  
##  4  0.0287  
##  5  0.0562  
##  6  0.0149  
##  7  0.0452  
##  8  0.0806  
##  9 -0.0368  
## 10  0.0250
summary(mean(df$PBS, na.rm = TRUE)) ## mean from non-imputed dataset
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03652 0.03652 0.03652 0.03652 0.03652 0.03652

Using imputed data set for specification curve analysis

# First - save the imputed data so that you don't have to do it again and again
save(imp_mypred, file = "impdata.rda")

# Load the data everytime you want to  use it
load("impdata.rda")

# For specification  curves - we will use the long form of the data
c.long = complete(imp_mypred, 'long') # convert imputed data into long form dataframe
c.broad = complete(imp_mypred, 'broad') # convert imputed data into broad form dataframe
## New names:
## * SubID -> SubID...1
## * Group -> Group...2
## * Age -> Age...3
## * Schoolgrade -> Schoolgrade...4
## * Gender -> Gender...5
## * ...

Specification curves on non-imputed data set

spec_nonimp = run_specs(df = df,
                    x = c("crEA"),
                    y = c("CBCL"),
                    controls = c("Age", "Gender", "Race", "Ethnicity"),
                    model = c("lm"),
                    all.comb = TRUE)
plot_specs(spec_nonimp) # visualize

Specification curves with imputations

Separate spec curve for each imputed data set

# running specification curve
spec_imp <- c.long %>% 
  group_by(.imp) %>% # group data based on which imputed data set is
  nest() %>% # creates data frames within data frames
  mutate(specs = purrr::map(data, ~run_specs(df = .,
                    x = c("crEA"),
                    y = c("CBCL"),
                    controls = c("Age", "Gender", "Race", "Ethnicity"),
                    model = c("lm"),
                    all.comb = TRUE))) # create a separate specification curve for each imputed data set

Combining results across imputed data sets

# unnest specification curves

unnested_specs <- spec_imp %>% unnest(specs) # 'takes out' the specifications from the dataframe

# Function for pooling standard deviation across the imputations

pooled_sd = function(means, sds){
  n = length(means)
  variance_within = (1/n)*sum(sds^2)
  variance_between = var(means)
  variance_total = variance_within + variance_between + (variance_between/n)
  return(sqrt(variance_total))
}

#### grouping and getting mean estimate (rubin's rules)

mean_estimate <- unnested_specs %>% 
  group_by(controls, y, x, model, subsets) %>% 
  summarize(std.error = pooled_sd(means = estimate, 
                                  sds = std.error),
            estimate = mean(estimate)
  ) %>% 
  mutate(conf.low = estimate-2*std.error, 
         conf.high = estimate+2*std.error, 
         statistic = estimate/std.error) %>% 
  ungroup()
## `summarise()` has grouped output by 'controls', 'y', 'x', 'model'. You can override using the `.groups` argument.
plot_specs(mean_estimate) # visualize 

Another example: CBCL ~ Attachment security

## running specification curve
spec_imp_attach <- c.long %>% 
  group_by(.imp) %>% # group data based on which imputed data set is
  nest() %>% # creates data frames within data frames
  mutate(specs = purrr::map(data, ~run_specs(df = .,
                    x = c("Attachment_security_4", "Attachment_security_8"),
                    y = c("CBCL"),
                    controls = c("Age", "Gender", "Race", "Ethnicity"),
                    model = c("lm"),
                    all.comb = TRUE))) # create a separate specification curve for each imputed data set

#### unnest specification curves

unnested_specs_attach <- spec_imp_attach %>% unnest(specs) # 'takes out' the specifications from the dataframe

#### grouping and getting mean estimate (rubin's rules)

mean_estimate_attach <- unnested_specs_attach %>% 
  group_by(controls, y, x, model, subsets) %>% 
  summarize(std.error = pooled_sd(means = estimate, 
                                  sds = std.error),
            estimate = mean(estimate)
  ) %>% 
  mutate(conf.low = estimate-2*std.error, 
         conf.high = estimate+2*std.error, 
         statistic = estimate/std.error) %>% 
  ungroup()
## `summarise()` has grouped output by 'controls', 'y', 'x', 'model'. You can override using the `.groups` argument.
plot_specs(mean_estimate_attach) # visualize